---
title: "Reboredo Segovia et al.2022 R script"
updated: 1/31/2022
output: html_notebook
---

```{r open libraries}

# packages

library(MASS)
library(vcd)
library(car)
library(tidyverse)
library(ggplot2)
library(ggfortify)
library(AER)
library(DHARMa)
library(Rfit)
library(leaps)
library(caret)
library(rsq)
library(visreg)
library(tidyr)
library(dplyr)
library(broom)
library(pander)
library(gridExtra)

# functions 

# inverse hyperbolic sine function
ihs <- function(x) {
  y <- log(x + sqrt(x ^ 2 + 1))
  return(y)
}

```

```{r full dataset and variable distributions}

# load dataset
MUN_dat <-read.csv('~/Dropbox/PhD/Paper_2/Data_tab/MUN_data_full_9_21_IC.csv')

# index by Municipality code

rownames(MUN_dat) <-MUN_dat$MUN_COD_SHP

# transform variables

MUN_dat$logVict <- log(MUN_dat$Vict_2016)
MUN_dat$logIUA_AS_mea <- log(MUN_dat$IUA_AS_mea)
MUN_dat$P_informality <- MUN_dat$P_informality * 100 # percent instead of propotion
MUN_dat$logIUA_AS_mea <- log(MUN_dat$IUA_AS_mea)
MUN_dat$ihs_ha_best <- ihs(MUN_dat$ha_best)
MUN_dat$ihs_valor_imp <- ihs(MUN_dat$valor_imp)
MUN_dat$ihs_valor_inf <- ihs(MUN_dat$valor_inf)
MUN_dat$logAccess <- log(MUN_dat$Access)
MUN_dat$logAVG_MUN_COST <- log(MUN_dat$AVG_MUN_COST)
MUN_dat$logIC <- log(MUN_dat$MUN_AVG_93_16)
MUN_dat$logICM <- log(MUN_dat$DEP_AVG_MUN)
MUN_dat$logICI <- log(MUN_dat$DEP_AVG_IUA)
MUN_dat$logICA <- log(MUN_dat$DEP_AVG_HA)
MUN_dat$logICP <- log(MUN_dat$DEP_AVG_POP)
MUN_dat$log_Pop <-log(MUN_dat$Pop_05)
MUN_dat$log_MUN_ha <-log(MUN_dat$MUN_ha)

# Create a weighted by area outcome variable

MUN_dat$Parcel_ha <- MUN_dat$Parcel/MUN_dat$log_MUN_ha
MUN_dat$Parcel_ha <- as.integer(MUN_dat$Parcel_ha)
MUN_dat$ihs_ha_best_ha <- MUN_dat$ihs_ha_best/MUN_dat$log_MUN_ha
MUN_dat$ihs_valor_imp_ha <- MUN_dat$ihs_valor_imp/MUN_dat$log_MUN_ha
MUN_dat$ihs_valor_inf_ha <- MUN_dat$ihs_valor_inf/MUN_dat$log_MUN_ha

# Export csv file for all data, including transformed
  
write.csv(MUN_dat,"~/Dropbox/PhD/Paper_2/Data_tab/MUN_dat_all_9_21_21.csv", row.names = TRUE)

# Pare down to only municipalities with all data fields

MUN_dat <- MUN_dat[which(MUN_dat$Andes_MUN == 1),]
MUN_dat <- MUN_dat[!is.na(MUN_dat$P_informality),]
MUN_dat <- MUN_dat[!is.na(MUN_dat$Metro),]
MUN_dat <- MUN_dat[!is.na(MUN_dat$Vict_2016),]
MUN_dat <- MUN_dat[!is.na(MUN_dat$IUA_AS_mea),]
MUN_dat <- MUN_dat[!is.na(MUN_dat$NOM_DEP),]
MUN_dat <- MUN_dat[!is.na(MUN_dat$No_Source),]
MUN_dat <- MUN_dat[!is.na(MUN_dat$logVict),]
MUN_dat <- MUN_dat[!is.na(MUN_dat$NBI),]

# plot histograms with reduced dataset

# model variables
Factors <- MUN_dat %>% dplyr::select(Parcel,ha_best,valor_imp,P_informality,logVict,Metro,logIUA_AS_mea,NBI,No_Source, logAccess, ihs_ha_best,ihs_valor_imp,MUN_AVG_93_16,logIC,logICI,logICM,logICA,logICP,log_Pop,log_MUN_ha,ZS_ECOS_T_LIN,ZS_ECOS_T_LOG)

# Show histograms of the different variables and their transformations
pdf(paste('~/Dropbox/PhD/Paper_2/Figures/Subset_distributions/Complete_dataset.pdf',sep='')) 
par(mfrow=c(3, 3))
for (i in 1:length(Factors)) {
  colnames <- dimnames(Factors)[[2]]
  dt <- density(Factors[,i]) #treated
  hist(Factors[,i], xlim=c(0, max(Factors[,i])),ylim = c(0,max(dt$y)),
       breaks =40,
       main=colnames[i], probability=TRUE, col="gray", border="white",
       xlab = colnames[i])
  d <- density(Factors[,i])
  lines(d, col="red")
}
dev.off()

Save_dat <- MUN_dat
```

```{r choosing between poisson and negative binomial}

# Poisson glm

#MOD_p <- glm(Parcel ~ P_informality + logAVG1, data = MUN_dat,family = 'poisson')
#summary(MOD_p)
#autoplot(MOD_p)
#dispersiontest(MOD_p)

# Negative binomial glm

full.model <- glm.nb(Parcel ~ P_informality + logVict + Metro + logIUA_AS_mea + NBI + No_Source + logAccess + logIC, data = MUN_dat, link = log)
#summary(full.model)
#autoplot(full.model)

step.model <- stepAIC(full.model, direction = "both", 
                      trace = FALSE)
summary(step.model)
autoplot(step.model)

```

```{r export subset distributions}
# Export variable distributions for all data subsets

MUN_dat <- Save_dat

p_all <-read_csv('~/Dropbox/PhD/Paper_2/Data_tab/Model_iterate.csv')
p_all <-p_all[complete.cases(p_all[,3:4]),]

Subset <- p_all[!duplicated(p_all$Subset),]
Subset <- Subset[complete.cases(Subset[ , 11:12]),]

for (subset in 1:nrow(Subset)){
  p = as.list(Subset[subset,])
  # Apply rules for each subset
   if(p$Subset_rules_1_bin == 1){MUN_dat <- MUN_dat[which(MUN_dat$No_Source > 1),]
   }else{MUN_dat <- MUN_dat} # includes only data with more than one source
   if(p$Subset_rules_2_bin == 1){MUN_dat <- MUN_dat %>% filter(no_ha_per <= 10 & no_valor_per <= 10)
   }else{MUN_dat <- MUN_dat} # excludes municipalities with more than 10% missing data
   if(p$Subset_rules_3_bin == 1){MUN_dat <- MUN_dat[MUN_dat$MUN_COD_SHP != "CO44035" & MUN_dat$MUN_COD_SHP != "CO47001" & MUN_dat$MUN_COD_SHP != "CO05250",]
   }else{MUN_dat <- MUN_dat} # excludes outliers
   if(p$Subset_rules_4_bin == 1){MUN_dat[MUN_dat$MUN_COD_SHP != "CO11001",]
     }else{MUN_dat <- MUN_dat} # excludes Bogota
   if(p$Subset_rules_5_bin == 1){MUN_dat <- MUN_dat[(MUN_dat$W_Andes == 1),]
   }else{MUN_dat <- MUN_dat} # includes only municipalities within the Andes
   if(p$Subset_rules_6_bin == 1){MUN_dat <- MUN_dat[MUN_dat$DEP_CODE %in% c("CO41", "CO25", "CO66", "CO17", "CO05"),]
   }else{MUN_dat <- MUN_dat} # subsets departments
  
  # model variables
  if (p$W_Andes == 'Yes'){
      Factors <- MUN_dat %>% dplyr::select(Parcel,ha_best,valor_imp,P_informality,logVict,Metro,logIUA_AS_mea,NBI,No_Source, logAccess,
                                       logICA,ihs_ha_best,ihs_valor_imp,MUN_AVG_93_16,logIC,logICI,logICM,logICA,logICP,log_Pop,log_MUN_ha,
                                       logAVG_MUN_COST,ZS_ECOS_T_LIN,ZS_ECOS_T_LOG)}else{
      Factors <- MUN_dat %>% dplyr::select(Parcel,ha_best,valor_imp,P_informality,logVict,Metro,logIUA_AS_mea,NBI,No_Source, logAccess,
                                       logICA,ihs_ha_best,ihs_valor_imp,MUN_AVG_93_16,logIC,logICI,logICM,logICA,logICP,log_Pop,log_MUN_ha,
                                       ZS_ECOS_T_LIN,ZS_ECOS_T_LOG)
      } # If completely within the Andes,will include average cost of a ha in the figure
Factors <-Factors[complete.cases(Factors[,6:7]),]
pdf(paste('~/Dropbox/PhD/Paper_2/Figures/Subset_distributions/Variable_distributions_',subset,'.pdf',sep='')) 
par(mfrow=c(3, 3))
for (i in 1:length(Factors)) {
  colnames <- dimnames(Factors)[[2]]
  dt <- density(Factors[,i]) #treated
  hist(Factors[,i], xlim=c(0, max(Factors[,i])),ylim = c(0,max(dt$y)),
       breaks =50,
       main=colnames[i], probability=TRUE, col="gray", border="white",
       xlab=colnames[i])
  d <- density(Factors[,i])
  lines(d, col="red")
}
dev.off()
MUN_dat <- Save_dat
}

```

```{r all model runs}

#OPENS MODEL SPECIFICATION FILES

p_all <-read_csv('~/Dropbox/PhD/Paper_2/Data_tab/Model_iterations/Model_iterate_ALL.csv')

#ELIMINATES EMPTY EXCEL ROWS
p_all <- p_all[complete.cases(p_all[ , 3:4]),]

# Reset to full dataset

MUN_dat <- Save_dat

# Run models for each specification and save outputs

#Variance inflation factors
VIF <- setNames(data.frame(matrix(ncol = 8, nrow = 0)),
                c("term","estimate","std.error","statistic","p.value","conf.low","conf.high","model"))

# Model fit for linear models
Fit_lm <- setNames(data.frame(matrix(ncol = 3, nrow = 0)),
                c("names","x","model"))

# Coefficients
Coef <- setNames(data.frame(matrix(ncol = 14,nrow = 0)),
                 c("r.squared","adj.r.squared","sigma","statistic","p.value","df","logLik",
          "AIC","BIC","deviance","df.residual","nobs","model"))

# Model fit for generalized linear models
Fit_nb <- setNames(data.frame(matrix(ncol = 8, nrow = 0)),
                c("null.deviance", "df.null","logLik","AIC","BIC",
                  "deviance", "df.residual","nobs model"))

for (i_p in 1:nrow(p_all)) {
  p = as.list(p_all[i_p, ])
  print(paste("doing row", i_p, "of p_all"))
  
  # set outcome and variables
  outcome <- p$Outcome
  variables <- c(p$Vars)
  
  # subset data
   if (p$Subset_rules_1_bin == 1){MUN_dat <- MUN_dat[which(MUN_dat$No_Source > 1),]
   }else{MUN_dat <- MUN_dat} # includes only data with more than one source
   if (p$Subset_rules_2_bin == 1){MUN_dat <- MUN_dat %>% filter(no_ha_per <= 10 & no_valor_per <= 10)
   }else{MUN_dat <- MUN_dat} # excludes municipalities with more than 10% missing data
   if (p$Subset_rules_3_bin == 1){MUN_dat <- MUN_dat[MUN_dat$MUN_COD_SHP != "CO44035" & MUN_dat$MUN_COD_SHP != "CO47001" & MUN_dat$MUN_COD_SHP != "CO05250",]
   }else{MUN_dat <- MUN_dat} # excludes outliers
 if (p$Iteration == "REM_OUTLIER"){MUN_dat <- MUN_dat[MUN_dat$MUN_COD_SHP != "CO44035" & MUN_dat$MUN_COD_SHP != "CO47001" & MUN_dat$MUN_COD_SHP != "CO05250"& MUN_dat$MUN_COD_SHP != "CO68549" & MUN_dat$MUN_COD_SHP != "CO50686" & MUN_dat$MUN_COD_SHP != "CO05031"& MUN_dat$MUN_COD_SHP != "CO05670"& MUN_dat$MUN_COD_SHP != "CO17513" & MUN_dat$MUN_COD_SHP != "CO73024" & MUN_dat$MUN_COD_SHP != "CO17442"& MUN_dat$MUN_COD_SHP != "CO05125" & MUN_dat$MUN_COD_SHP != "CO05113" & MUN_dat$MUN_COD_SHP != "CO17050" & MUN_dat$MUN_COD_SHP != "CO73347" & MUN_dat$MUN_COD_SHP != "CO05501",]
   }else{MUN_dat <- MUN_dat} # excludes outliers
   if (p$Subset_rules_4_bin == 1){MUN_dat <- MUN_dat[MUN_dat$MUN_COD_SHP !="CO11001",] 
     }else{MUN_dat <- MUN_dat} # excludes Bogota
   if (p$Subset_rules_5_bin == 1){MUN_dat <- MUN_dat[(MUN_dat$W_Andes == 1),]
   }else{MUN_dat <- MUN_dat} # includes only municipalities within the Andes
   if (p$Subset_rules_6_bin == 1){MUN_dat <- MUN_dat[MUN_dat$DEP_CODE %in% c("CO41", "CO25", "CO66", "CO17", "CO05"),]
   }else{MUN_dat <- MUN_dat} # subsets departments
  
  # create model formula 
f_mod <- as.formula(
  paste(outcome,
        paste(variables, collapse = " + "),
        sep = " ~ "))

# Run glm.nb for Parcel outcome rows

if (p$Outcome == 'Parcel'|| p$Outcome =='Parcel_ha'){
    full.model <- glm.nb(f_mod, data = MUN_dat)} else{
    full.model <- lm(f_mod, data = MUN_dat)
    } # If the outcome is Parcel, will use negative binomial model


step.model <- stepAIC(full.model, direction = "both", 
                      trace = FALSE)

# Pool model results 

# Model output, coefficients
combined.results1 <- bind_rows(
  tidy(step.model, conf.int = T) %>% mutate(model= p$Run_no, outcome = p$Outcome, vars = p$Vars, name = p$Name, subset_no = p$Subset, iteration = p$Iteration))

# Variance inflation factors
if (length(step.model$coef) > 2){
combined.results2 <- bind_rows(tidy(car::vif(step.model), conf.int = T) %>% mutate(model=p$Run_no, outcome = p$Outcome, vars = p$Vars, name = p$Name, subset_no = p$Subset, iteration = p$Iteration))} 
else {next}

# Model output, fit
combined.results3 <- bind_rows(
  glance(step.model, conf.int = T) %>% mutate(model=p$Run_no, outcome = p$Outcome, vars = p$Vars, name = p$Name, subset_no = p$Subset,iteration = p$Iteration))

# Bind results to dataframe 

Coef <- rbind(Coef, combined.results1)

if (p$Outcome == "Parcel"|| p$Outcome =='Parcel_ha'){Fit_nb <- rbind(Fit_nb, combined.results3)}
       else{Fit_lm <- rbind(Fit_lm, combined.results3)} # adds results to different dataframes depending on whether it is a linear model or a glm

MUN_dat <-Save_dat

}

# write model results to .csv

write.csv(Coef,'~/Dropbox/PhD/Paper_2/Model_results/Coef_ALL.csv', row.names = FALSE)
write.csv(VIF,'~/Dropbox/PhD/Paper_2/Model_results/VIF_ALL.csv', row.names = FALSE)
write.csv(Fit_lm,'~/Dropbox/PhD/Paper_2/Model_results/Fit_lm_ALL.csv', row.names = FALSE)
write.csv(Fit_nb,'~/Dropbox/PhD/Paper_2/Model_results/Fit_nb_ALL.csv', row.names = FALSE)

```

```{plot model coefficients and p-values}

Coef <- read.csv('~/Dropbox/PhD/Paper_2/Model_results/Coef_ALL.csv')

hist.vars = unique(Coef$term)[-1]
hist.outcomes = unique(Coef$outcome)

color_pallete_function <- colorRampPalette(
  colors = c("red", "orange", "blue"),)

Coef$subset_no <- as.factor(Coef$subset_no)
Coef$iteration <- as.factor(Coef$iteration)

num_colors <- nlevels(Coef$subset_no)
num_colors2 <-nlevels(Coef$iteration)
subset_colors <- color_pallete_function(num_colors)
iteration_colors <- color_pallete_function(num_colors2)

plot model coefs

for(o in 1:length(hist.outcomes)){
  par(mfrow=c(3,3))
 for(v in 1:length(hist.vars)){
    if(length(Coef[Coef$term==hist.vars[v] & Coef$outcome==hist.outcomes[o], "estimate"])>0){
          plot(Coef[Coef$term==hist.vars[v] & Coef$outcome==hist.outcomes[o], "estimate"],
               main = paste(hist.vars[v], ' (', hist.outcomes[o],')', sep = ''), xlab = "frequency", ylab = "estimate")
    }
  }
}

# plot model p values

for(o in 1:length(hist.outcomes)){
  par(mfrow=c(2,2))
  for(v in 1:length(hist.vars)){
    if(length(Coef[Coef$term==hist.vars[v] & Coef$outcome==hist.outcomes[o], "estimate"])>0){
          plot(Coef[Coef$term==hist.vars[v] & Coef$outcome==hist.outcomes[o], "p.value"],
               main = paste(hist.vars[v], ' (', hist.outcomes[o],')', sep = ''), xlab = "number of models", ylab = "p-value",col = subset_colors,pch = 20)
      abline(h=.05, col="red")
    }
  }
}

```
